Preliminary settings

Libraries

library(readr)
library(stringr)
library(dplyr)
library(here)

library(ggplot2)

library(mogo)
library(clusterProfiler)
library(viridis)

Paths

Let’s set the paths to the directory containing the raw data and to the directory to which we will save our results.

raw_data_path = paste(here(), "data", "raw", sep = "/")
result_path = paste(here(), "results", sep = "/")
graphics_path = paste(here(), "graphics", sep = "/")

Cluster 1

Load data

Read data from csv file and select the transcript identifiers (e.g. MGG_07200T0).

cluster1 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 1 _on_guy11_and_dpmk1.csv') %>% 
  dplyr::select(id)

Convert transcript identifier to gene identifier (e.g. from MGG_07200T0 to MGG_07200) and collapse duplicate gene identifiers: for example, transcript MGG_07200T0 (gene MGG_07200) appears twice in cluster 1 because it correspond to a protein with two phosphorylation sites ([1-38]-1xMet-loss+Acetyl [N-Term];1xPhospho [S20(91.8)] and [1-38]-1xMet-loss+Acetyl [N-Term];1xPhospho [T16(96.9)]); in this case, we only keep the one instance of the transcript/gene identifier to avoid over-estimating the representation of the GO term(s) associated with this transcript/gene identifier.

clust1 = cluster1 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
  mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
  dplyr::select(gene_ids) %>% 
  unique() # collapse duplicates of the same gene identifier

GO Enrichment

Perform the GO enrichment analysis

enrich1 <- do_enrich(clust1$gene_ids)
barplot(enrich1) + 
  scale_fill_viridis(direction = -1) + 
  ggplot2::theme(aspect.ratio = 1)

dotplot(enrich1) + 
  scale_colour_viridis(direction = -1)

Write results in CSV

enrich1_df = as.data.frame(enrich1) %>%
  select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)

csv_filename = paste(result_path, "011_cluster1_go_enrichment.csv", sep = "/")
write_csv(enrich1_df, csv_filename)

Cluster 2

Load data

Read data from csv file.

cluster2 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 2 _on_guy11_and_dpmk1.csv') %>% 
  dplyr::select(id)

Keep only the gene id

clust2 = cluster2 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
  mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
  dplyr::select(gene_ids) %>% 
  unique()

GO Enrichment

enrich2 <- do_enrich(clust2$gene_ids)
barplot(enrich2) + 
  scale_fill_viridis(direction = -1) + 
  ggplot2::theme(aspect.ratio = 1)

dotplot(enrich2) + 
  scale_colour_viridis(direction = -1)

Write results in CSV

enrich2_df = as.data.frame(enrich2) %>%
  select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)

csv_filename = paste(result_path, "011_cluster2_go_enrichment.csv", sep = "/")
write_csv(enrich2_df, csv_filename)

Cluster 3

Load data

Read data from csv file.

cluster3 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 3 _on_guy11_and_dpmk1.csv') %>% dplyr::select(id)

Keep only the gene id

clust3 = cluster3 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
  mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
  dplyr::select(gene_ids) %>% unique()

GO Enrichment

enrich3 <- do_enrich(clust3$gene_ids)
barplot(enrich3) + scale_fill_viridis(direction = -1) + ggplot2::theme(aspect.ratio = 1)

dotplot(enrich3) + scale_colour_viridis(direction = -1)

Write results in CSV

enrich3_df = as.data.frame(enrich3) %>%
  select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)

csv_filename = paste(result_path, "011_cluster3_go_enrichment.csv", sep = "/")
write_csv(enrich3_df, csv_filename)

Cluster 4

Load data

Read data from csv file.

cluster4 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 4 _on_guy11_and_dpmk1.csv') %>% 
  dplyr::select(id)

Keep only the gene id

clust4 = cluster4 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
  mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
  dplyr::select(gene_ids) %>% 
  unique()

GO Enrichment

enrich4 <- do_enrich(clust4$gene_ids)
barplot(enrich4) + 
  scale_fill_viridis(direction = -1) + 
  ggplot2::theme(aspect.ratio = 1)

dotplot(enrich4) + 
  scale_colour_viridis(direction = -1)

Write results in CSV

enrich4_df = as.data.frame(enrich4) %>%
  select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)

csv_filename = paste(result_path, "011_cluster4_go_enrichment.csv", sep = "/")
write_csv(enrich3_df, csv_filename)

Cluster 5

Load data

Read data from csv file.

cluster5 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 5 _on_guy11_and_dpmk1.csv') %>% 
  dplyr::select(id)

Keep only the gene id

clust5 = cluster5 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
  mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
  dplyr::select(gene_ids) %>% 
  unique()

GO Enrichment

enrich5 <- do_enrich(clust5$gene_ids)
barplot(enrich5) + 
  scale_fill_viridis(direction = -1) + 
  ggplot2::theme(aspect.ratio = 1)

dotplot(enrich5) + 
  scale_colour_viridis(direction = -1)

Write results in CSV

enrich5_df = as.data.frame(enrich5) %>%
  select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)

csv_filename = paste(result_path, "011_cluster5_go_enrichment.csv", sep = "/")
write_csv(enrich5_df, csv_filename)

Selected protein target

Load data

Read data from csv file.

data_filename = paste(raw_data_path, "Pmk1 direct targets PRM_101122.xlsx", sep = "/")

targets = readxl::read_xlsx(data_filename) %>% 
  dplyr::select(`Phosphorylation position`) %>% unique()

Keep only the gene id (for now I remove gi number) Replacement:

gi|145610929|ref|XP_368444.2| MST7 = MGG_00800 gi|39964959|ref|XP_365053.1| MAC1 (S75) = MGG_09898

targets2 = targets %>% 
  dplyr::mutate(gene_ids = 
      stringr::str_split(`Phosphorylation position`, " ", n = 2, simplify = TRUE)[,1]) %>%
  dplyr::mutate(gene_ids = ifelse(`gene_ids` == 'gi|145610929|ref|XP_368444.2|', 
    'MGG_00800', gene_ids)) %>%
  dplyr::mutate(gene_ids = ifelse(`gene_ids` == 'gi|39964959|ref|XP_365053.1|', 
    'MGG_09898', gene_ids))

In several cases, few selected peptides correspond to the same protein and thus the different lines/rows (peptides) are collapsed so that a single MGG number (gene/protein) is taken into consideration in the GO enrichment analysis resulting in a list of 31 proteins.

gene_ids = targets2 %>% 
  dplyr::mutate(gene_ids = stringr::str_split(`Phosphorylation position`, " ", n = 2, simplify = TRUE)[,1]) %>% 
  dplyr::filter(str_detect(gene_ids, "^MGG")) %>% 
  dplyr::select(gene_ids)  %>% 
  unique()

GO Enrichment

enrich_targets <- do_enrich(gene_ids$gene_ids)
barplot(enrich_targets) + 
  #scale_fill_viridis(direction = -1) + 
  ggplot2::scale_fill_distiller(type = "seq", direction = -1, palette = 'Reds') +
  ggplot2::theme(aspect.ratio = 1)

# PRM-GO-dotplot.pdf
dotplot(enrich_targets) + 
  ggplot2::scale_colour_distiller(type = "seq", direction = -1, palette = 'Reds', limits = c(0, 0.005)) 

# limits and directions are set so that the more intense the colour is, more smaller/better the p-value is

Write results in CSV

enrich_targets_df = as.data.frame(enrich_targets) %>%
  select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)

csv_filename = paste(result_path, "011_targets_go_enrichment.csv", sep = "/")
write_csv(enrich_targets_df, csv_filename)

Summary

February 2023: New request from Neftaly.

Let’s summarise all the go enrichement analysis in a single dot plot.

First, for all the dataframe build previously, we add a column with the cluster number (as categorial variable, not numerical).

enrich1_df$cluster = "1"
enrich2_df$cluster = "2"
enrich3_df$cluster = "3"
enrich4_df$cluster = "4"
enrich5_df$cluster = "5"

Now we can bind all these data frames together.

enrichment_summary_df = rbind(enrich1_df, enrich2_df, 
  enrich3_df, enrich4_df, enrich5_df)

We transform the current gene ratio (50/100) to its numerical value (0.5).

enrichment_summary_df = enrichment_summary_df |> mutate(
  gene.ratio = sapply(strsplit(enrichment_summary_df$GeneRatio, "/"), 
    function(x) as.numeric(x[1])/as.numeric(x[2]))) 

We draw th plot

ggplot(enrichment_summary_df,
  aes(x = cluster, y = Description, size = gene.ratio, colour = p.adjust)) + 
  geom_point()

plot_filename = paste(graphics_path, "011-go-enrichment-summary.pdf", sep = "/")
ggsave(file = plot_filename,
  plot = last_plot(), width = 10, height = 7)

Sort out these GO terms!

library(GO.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:clusterProfiler':
## 
##     rename
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:clusterProfiler':
## 
##     slice
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library(viridis)

ontology = unlist(Ontology(GOTERM))
enrichment_summary_df$ont = ontology[enrichment_summary_df$ID]

enrichment_summary_df = enrichment_summary_df |> mutate(
  labels = paste0(Description, " (", ID, ")")
  ) 

ggplot(enrichment_summary_df,
  aes(x = cluster, y = labels, size = gene.ratio, colour = p.adjust)) + 
  #scale_color_viridis(direction = -1) +
  geom_point() + facet_grid(ont~., scales = "free", space = "free")

plot_filename = paste(graphics_path, "011-go-enrichment-summary-v1.pdf", sep = "/")
ggsave(file = plot_filename,
  plot = last_plot(), width = 10, height = 7)

What are these GO terms that are displayed as “NA”.

After discussion with Neftaly, we will remove the obsolete term and move the ATP-dependant microtubule to MF:

cleaned_enrichment_summary_df = enrichment_summary_df %>%
  filter(labels != "obsolete signal transducer activity (GO:0004871)") %>%
  mutate(ont = ifelse(labels == "ATP-dependent microtubule motor activity (GO:1990939)", "MF", ont))
ggplot(cleaned_enrichment_summary_df,
  aes(x = cluster, y = labels, size = gene.ratio, colour = p.adjust)) + 
  scale_color_viridis(direction = -1) +
  geom_point() + 
  facet_grid(ont~., scales = "free", space = "free")

plot_filename = paste(graphics_path, "011-go-enrichment-summary-v2.pdf", sep = "/")
ggsave(file = plot_filename,
  plot = last_plot(), width = 7, height = 10)
ggplot(enrichment_summary_df,
  aes(
    #x = cluster,
    x = ordered(cluster, levels = c("5", "4", "3", "2", "1")), 
    y = labels, size = gene.ratio, colour = p.adjust)) + 
  scale_color_viridis(option = "B", direction = -1) +
  geom_point() + facet_grid(~ont, scales = "free", space = "free") +
 coord_flip() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + labs(x = "clusters")

plot_filename = paste(graphics_path, "011-go-enrichment-summary-v3.pdf", sep = "/")
ggsave(file = plot_filename,
  plot = last_plot(), width = 10, height = 7)